# data from http://insideairbnb.com/get-the-data.html
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import folium
import json
sns.set(rc={'figure.figsize': (11, 4)})


def _price_to_float(prices):
    """Convert a column of price strings to floats.

    The first character of every value is dropped (assumed to be a
    currency symbol such as ``$`` -- TODO confirm against the CSVs) and
    comma separators are removed before casting to float.
    """
    return prices.str[1:].str.split(',').str.join('').astype(float)


# calendar is indexed by its parsed 'date' column so that the datetime
# accessors on calendar.index used below are available.
calendar = pd.read_csv('data/calendar.csv', index_col='date', parse_dates=True)
listings = pd.read_csv('./data/listings.csv', parse_dates=True)
reviews = pd.read_csv('./data/reviews.csv', parse_dates=True)

listings['price'] = _price_to_float(listings['price'])
# Turn the amenities string into a list: strip the surrounding braces,
# split on commas and drop the double quotes around individual items.
listings['amenities'] = listings['amenities'].map(
    lambda raw: [item.strip('"') for item in raw.strip('{}').split(',')]
)

# Calendar features derived from the date index.
calendar['Year'] = calendar.index.year
calendar['Month'] = calendar.index.month
calendar['month_name'] = calendar.index.month_name()
# DatetimeIndex.weekday_name was removed in pandas 1.0; day_name() is the
# supported replacement and returns the same English day names.
calendar['Weekday Name'] = calendar.index.day_name()
calendar['Weekday No'] = calendar.index.weekday
# DatetimeIndex.weekofyear was removed in pandas 2.0; the ISO week number
# now comes from isocalendar(). The result shares calendar's index, so the
# assignment is positional even though dates repeat.
calendar['weekofyear'] = calendar.index.isocalendar().week
calendar['price'] = _price_to_float(calendar['price'])
# True where a price is present for that listing/date row.
calendar['price_available'] = ~calendar.price.isnull()

# The bare .head() expressions only render output in a notebook.
calendar.head()
reviews.head()
# Narrow the column display width while previewing the wide listings table.
pd.set_option('display.max_colwidth', 10)
listings.head()
# Interactive map: neighbourhoods shaded by listing count, plus one small
# dot per listing coloured by room type.
m = folium.Map(location=[42.33, -71.07], zoom_start=11.5)

# Number of listings in each neighbourhood.
df = listings.groupby(['neighbourhood_cleansed']).id.count().reset_index()
df.columns = ['neighbourhood', 'listings_count']

with open('data/neighbourhoods.geojson') as geojson_file:
    b_json = json.load(geojson_file)

# The GeoJSON features are matched to df rows through their
# 'properties.neighbourhood' field.
choropleth = folium.Choropleth(
    geo_data=b_json,
    name='choropleth',
    data=df,
    columns=['neighbourhood', 'listings_count'],
    key_on='properties.neighbourhood',
    fill_color='YlOrRd',
    fill_opacity=0.6,
    line_opacity=0.2,
    legend_name='listings count',
)
choropleth.add_to(m)

colors = {
    "Entire home/apt": "blue",
    "Private room": "green",
    "Shared room": "red",
}
# A room type missing from `colors` raises KeyError here.
listings['room_type_color'] = [colors[room_type] for room_type in listings['room_type']]


def _add_listing_marker(row):
    """Add a circle marker for one listings row to map ``m`` and return it."""
    marker = folium.CircleMarker(
        location=[row["latitude"], row["longitude"]],
        radius=1,
        color=row['room_type_color'],
        popup=row['neighbourhood'],
    )
    return marker.add_to(m)


# The markers are drawn on top of the choropleth map created above.
listings.apply(_add_listing_marker, axis=1)
m

# Restore a wider column display, then list neighbourhoods by listing count.
pd.set_option('display.max_colwidth', 50)
df.sort_values("listings_count", ascending=False)
import matplotlib.pyplot as plt

# Bar chart: number of listings per property type, most common first.
sns.set(rc={'figure.figsize': (15, 8.27)})

property_counts = listings.groupby(['property_type']).id.count().reset_index()
property_counts.columns = ['property type', 'number of properties']
data = property_counts.sort_values('number of properties', ascending=False)

g = sns.barplot(x='property type', y='number of properties', data=data)
plt.xticks(rotation=90)  # vertical tick labels
plt.show()

# Listing count per room type (only rendered when run in a notebook).
listings.groupby('room_type').id.count()
import matplotlib.pyplot as plt

# Mean of each availability_* column per neighbourhood, with neighbourhoods
# ordered by their mean availability_365.
availability_columns = [
    'availability_30',
    'availability_60',
    'availability_90',
    'availability_365',
]
df = (
    listings.groupby(['neighbourhood_cleansed'])
    .agg({column: 'mean' for column in availability_columns})
    .sort_values('availability_365')
    .reset_index()
)
sns.set(rc={'figure.figsize': (15, 8.27)})

# Long format: one row per (neighbourhood, availability column) pair.
data = pd.melt(df, id_vars=['neighbourhood_cleansed'])
data.columns = ['neighborhood', 'label', 'number of times']

g = sns.lineplot(x='neighborhood', y='number of times', hue='label', style='label', data=data)
plt.xticks(rotation=90)
plt.show()
import matplotlib.pyplot as plt

# Mean review sub-scores per neighbourhood, ordered by the mean check-in
# score. review_scores_location is not part of the aggregation.
score_means = listings.groupby('neighbourhood_cleansed').agg({
    'review_scores_cleanliness': 'mean',
    'review_scores_checkin': 'mean',
    'review_scores_communication': 'mean',
    'review_scores_rating': 'mean',
})
score_means = score_means.sort_values('review_scores_checkin')
# The overall rating is aggregated above but not plotted.
df = score_means.drop('review_scores_rating', axis=1).reset_index()

sns.set(rc={'figure.figsize': (15, 8.27)})
data = pd.melt(df, id_vars=['neighbourhood_cleansed'])
data.columns = ['neighborhood', 'label', 'avg Rating']

g = sns.lineplot(x='neighborhood', y='avg Rating', hue='label', style='label', data=data)
plt.xticks(rotation=90)
plt.show()
import matplotlib.pyplot as plt

sns.set(rc={'figure.figsize': (15, 8.27)})

# Listings per (neighbourhood, host response time) pair.
response_counts = (
    listings.groupby(['neighbourhood_cleansed', 'host_response_time'])
    .agg({'id': 'count'})
    .sort_values(['host_response_time', 'id'])
    .reset_index()
)
response_counts.columns = ['neighborhood', 'host_response_time', 'count']

# Total listings per neighbourhood, used as the denominator below.
df2 = listings.groupby('neighbourhood_cleansed').id.count().reset_index()
df2.columns = ['neighborhood', 'listings count']

# Fraction of each neighbourhood's listings in every response-time category.
df = pd.merge(response_counts, df2, on='neighborhood')
df['response per listings'] = df['count'] / df['listings count']

# Pivot wide so neighbourhoods can be ordered by their 'within an hour'
# share, then melt back to long form for plotting.
df = pd.pivot_table(
    df,
    values='response per listings',
    index=['neighborhood'],
    columns=['host_response_time'],
).sort_values('within an hour')
response_categories = [
    'a few days or more',
    'within a day',
    'within a few hours',
    'within an hour',
]
df = pd.melt(df.reset_index(), id_vars=['neighborhood'], value_vars=response_categories)
df.columns = ['neighborhood', 'host_response_time', 'response per listings']

g = sns.lineplot(
    data=df,
    x='neighborhood',
    y='response per listings',
    hue='host_response_time',
    style='host_response_time',
)
plt.xticks(rotation=90)
plt.show()
df
import matplotlib.pyplot as plt

# Mean calendar price per neighbourhood and month, using only calendar rows
# whose 'available' flag is 't'.
open_rows = calendar[calendar['available'] == 't']
df = pd.merge(
    listings[['id', 'neighbourhood_cleansed']],
    open_rows[['listing_id', 'price', 'month_name']],
    left_on='id',
    right_on='listing_id',
)
df = (
    df.groupby(['neighbourhood_cleansed', 'month_name'])
    .agg({'price': 'mean'})
    .sort_values('price')
    .reset_index()
)
df.columns = ['neighborhood', 'month', 'price ($)']

sns.set(rc={'figure.figsize': (15, 8.27)})
g = sns.lineplot(x='neighborhood', y='price ($)', hue='month', style="month", data=df)
plt.xticks(rotation=90)
plt.show()
import matplotlib.pyplot as plt

# Mean calendar price per neighbourhood and weekday, using only calendar
# rows whose 'available' flag is 't'.
open_rows = calendar[calendar['available'] == 't']
df = pd.merge(
    listings[['id', 'neighbourhood_cleansed']],
    open_rows[['listing_id', 'price', 'Weekday Name']],
    left_on='id',
    right_on='listing_id',
)
df = (
    df.groupby(['neighbourhood_cleansed', 'Weekday Name'])
    .agg({'price': 'mean'})
    .sort_values('price')
    .reset_index()
)
df.columns = ['neighborhood', 'Weekday Name', 'price ($)']

sns.set(rc={'figure.figsize': (15, 8.27)})
g = sns.lineplot(x='neighborhood', y='price ($)', hue='Weekday Name', style="Weekday Name", data=df)
plt.xticks(rotation=90)
plt.show()
import seaborn as sns

# Bar chart of calendar price per weekday over rows flagged available ('t').
# TODO: give the plot a proper title and axis labels.
sns.set_theme(style="whitegrid")
available = calendar[calendar['available'] == 't']
# Sorting by weekday number puts the bars in weekday-number order.
weekday_ordered = available.sort_values("Weekday No")
ax = sns.barplot(data=weekday_ordered, x="Weekday Name", y="price")
# The y-axis is clipped to 190-205, so the bars do not start at zero.
ax.set_ylim(190, 205)
import matplotlib.pyplot as plt

# Mean calendar price per neighbourhood and room type, using only calendar
# rows whose 'available' flag is 't'.
open_rows = calendar[calendar['available'] == 't']
df = pd.merge(
    listings[['id', 'neighbourhood_cleansed', 'room_type']],
    open_rows[['listing_id', 'price']],
    left_on='id',
    right_on='listing_id',
)
df = (
    df.groupby(['neighbourhood_cleansed', 'room_type'])
    .agg({'price': 'mean'})
    .sort_values(['room_type', 'price'])
    .reset_index()
)
df.columns = ['neighborhood', 'room_type', 'price ($)']

sns.set(rc={'figure.figsize': (15, 8.27)})
g = sns.lineplot(x='neighborhood', y='price ($)', hue='room_type', style='room_type', data=df)
plt.xticks(rotation=90)
plt.show()
import numpy as np

# One row per (listing, amenity); the constant 'count' column lets the pivot
# table total occurrences per amenity and room type.
amenity_rows = listings[['room_type', 'amenities']].explode('amenities')
amenity_rows['count'] = 1
amenity_counts = pd.pivot_table(
    amenity_rows,
    values='count',
    index=['amenities'],
    columns=['room_type'],
    aggfunc={'count': np.sum},
).sort_values('Entire home/apt', ascending=False)

# Divide by the number of listings of each room type to get the share of
# listings offering each amenity.
listings_per_room_type = listings.groupby('room_type').id.count()
df = (amenity_counts / listings_per_room_type).sort_values(
    ['Entire home/apt', 'Private room', 'Shared room'], ascending=False
)
# Drop the empty-string amenity entry.
df = df[df.index != '']
df.head(10)
df.shape
df.tail(10)

# The ten top-ranked amenities are treated as the "common" set.
normal_amenities = set(df.head(10).index.values)
listings['has_all_common_amenities'] = listings.amenities.apply(
    lambda offered: normal_amenities.issubset(offered)
)

# Compare listing prices with and without the full common set.
sns.set(rc={'figure.figsize': (10, 7)})
sns.boxplot(data=listings, x="has_all_common_amenities", y="price")
listings.groupby('has_all_common_amenities').price.mean().reset_index()
# Listings of recently joined hosts versus longer-standing hosts, per
# neighbourhood. A host is "new" when host_since lies within `days_old` days
# of the latest host_since date found in the data.
days_old = 90
listings['host_since'] = pd.to_datetime(listings['host_since'])
max_host = listings['host_since'].max()
listings['host_since(days)'] = (max_host - listings['host_since']).dt.days

df = listings[listings['host_since(days)'] < days_old].groupby(['neighbourhood_cleansed']).id.count().reset_index()
sns.set(rc={'figure.figsize': (15, 8.27)})
df.columns = ['neighbourhood', 'new_listings_count']
df2 = listings[listings['host_since(days)'] >= days_old].groupby(['neighbourhood_cleansed']).id.count().reset_index()
df2.columns = ['neighbourhood', 'old_listings_count']

# Default (inner) merge: only neighbourhoods present in both groups remain.
df = pd.merge(df, df2, on="neighbourhood")
# NOTE(review): this is the ratio new/old, not a value scaled to 0-100,
# although it is named 'percentage' and plotted under '% change' -- confirm
# the intended scale.
df['percentage'] = df['new_listings_count'] / df['old_listings_count']
# Previously two consecutive single-key sorts were applied; the second one
# discarded the first because the default sort is not stable. Sort once,
# by new-listing count with the ratio as the tie-breaker.
df = df.sort_values(['new_listings_count', 'percentage'], ascending=False)

font_color = '#525252'
hfont = {'fontname': 'Calibri'}
facecolor = '#eaeaf2'
color_red = '#fd625e'
color_blue = '#01b8aa'
index = df.neighbourhood
column0 = df['percentage']
column1 = df['new_listings_count']
title0 = '% change'
title1 = 'new listings count'

# Two horizontal bar charts that share the neighbourhood axis.
fig, axes = plt.subplots(figsize=(10, 5), facecolor=facecolor, ncols=2, sharey=True)
fig.tight_layout()
axes[0].barh(index, column0, align='center', color=color_red, zorder=10)
axes[0].set_title(title0, fontsize=18, pad=15, color=color_red, **hfont)
axes[1].barh(index, column1, align='center', color=color_blue, zorder=10)
axes[1].set_title(title1, fontsize=18, pad=15, color=color_blue, **hfont)

# Flip the left chart's x-axis so its bars grow leftwards.
axes[0].invert_xaxis()
# Show the first (highest) row at the top; the y-axis is shared, so
# inverting it on one subplot inverts it on both.
axes[0].invert_yaxis()

# Same tick-label styling on both subplots.
for ax in axes:
    for label in (ax.get_xticklabels() + ax.get_yticklabels()):
        label.set(fontsize=13, color=font_color, **hfont)

plt.subplots_adjust(wspace=0, top=0.85, bottom=0.1, left=0.18, right=0.95)
plt.show()
df
# Listings of recently joined hosts versus longer-standing hosts, per room
# type. A host is "new" when host_since lies within `days_old` days of the
# latest host_since date found in the data.
days_old = 90
listings['host_since'] = pd.to_datetime(listings['host_since'])
max_host = listings['host_since'].max()
listings['host_since(days)'] = (max_host - listings['host_since']).dt.days

# Recent hosts: listing count per room type. The key column is named
# 'room_type' (it was mislabelled 'neighbourhood') so that the table and
# the plot's x-axis describe what is actually grouped.
df = listings[listings['host_since(days)'] < days_old].groupby(['room_type']).id.count().reset_index()
df.columns = ['room_type', 'new_rooms']

# Longer-standing hosts (host_since at least `days_old` days before the
# latest one). This is not the overall total, hence 'old_rooms'.
df2 = listings[listings['host_since(days)'] >= days_old].groupby(['room_type']).id.count().reset_index()
df2.columns = ['room_type', 'old_rooms']

# Default (inner) merge: only room types present in both groups remain.
df = pd.merge(df, df2, on="room_type")
# NOTE(review): this is the ratio new/old, not a value scaled to 0-100,
# despite the column name -- confirm the intended scale.
df['percentage'] = df['new_rooms'] / df['old_rooms']
df = df.sort_values('percentage', ascending=False)

# Figure size must be configured before the plot is created.
sns.set(rc={'figure.figsize': (15, 8.27)})
g = sns.barplot(data=df, x='room_type', y='percentage', palette="Set1")
plt.show()
df